# Analyzing the distribution of benefits 
# - Use AP3 source-receptor matrix to calculate county level benefits
#   - Need change in emissions from baseline to optimal 
#   - (I think that's all since damages are linear)
# - Match counties to census block groups 
#   - Use census for CBG level demographics 
#   - Income, race, education 
# - Run analysis similar to XX 
#   - Lorenz curve 
#   - Benefits per capita ~ income 
#   - Benefits per capita ~ income decile x race 
#   - Regress benefits on income, urban, shares of races
library(pacman)
p_load(
  here, data.table, fst, readxl, magrittr, stringr
)
theme_set(theme_minimal())

# Loading data ----------------------------------------------------------------
  # TODO: Get source-receptor matrix from Nick Muller
  # Loading AP3 fips codes
  fips_dt = read_xlsx(
    path = here(
      'Data',
      'Marginal Damages (2011) from Holland, Mansur, Muller, Yates AER forthcoming.xlsx'
    ),
    sheet = 'ground-level MD per ton'
    ) %>%  
    data.table() %>% 
    .[,.( # Miami-Dade changed their fips code
      fips = fcase(
        fips == '12025', '12086',
        fips != '12025', str_pad(fips, 5,'left','0')
      )
    )]
  # Loading marginal damages
  md_dt = 
    rbind(
      cbind(
        fips_dt,
        fread(here("Data/AP3/JointFolder/md_L_2014_CS.csv"))[,stack_height := 'low']
      ),
      cbind(
        fips_dt,
        fread(here("Data/AP3/JointFolder/md_M_2014_CS.csv"))[,stack_height := 'medium']
      )
    ) |> 
    setnames(new = c('county_fips','nh3','nox','pm25','so2','voc','stack_height')) |>
    melt(
      id.vars = c('county_fips','stack_height'),
      variable.name = 'pollutant',
      value.name = 'marginal_damage'
    )
  # Creating synthetic source receptor matrix bc can't find actual 
  source_rec_dt = 
    CJ(
      county_fips = fips_dt$fips,
      receptor = fips_dt$fips
    ) |>
    merge(
      md_dt[pollutant %in% c('nox','so2','pm25')],
      by = 'county_fips',
      allow.cartesian = TRUE
    ) %>%
    .[,marginal_damage := marginal_damage/nrow(fips_dt)] |>
    setkey(county_fips, stack_height, pollutant)

  # Now loading emissions data for each plant 
  # TODO: Get emissions by plant for different simulations from Mark 
  # Plant info table
  plant_info_dt = 
    read.fst(
      path = here("Data/electricity-generation/plant-info-dt-oge.fst"),
      as.data.table = TRUE
    )[year == 2019, .SD[which.min(year)], by= plant_id_eia]|>
    setkey(plant_id_eia)
  # For now, using actual emissions 
  emissions_dt = 
    fread(here(
        'Data/electricity-generation/output/emissions-rate-avg-oge.csv'
    )) |>
    melt(id.vars = c('plant_id_eia','tot_net_generation_mwh')) |>
    merge(
      plant_info_dt, 
      by = 'plant_id_eia'
    ) %>%
    .[,.(tons = sum(value*tot_net_generation_mwh)),
      keyby = .(
        county_fips = fips,
        # For now we only have low/medium damages.
        stack_height = fcase(
          stack_height < 250, 'low',
          stack_height >= 250, 'medium',# & stack_height <= 500,
          #stack_height > 500, 'tall'
          default = 'low'
        ),
        pollutant = str_remove(variable, '_tons_per_mwh')
      )
    ]
  # Artificially creating two simulations 
  # TODO: Calculate change in emissions between simulations  
  sim_emissions_dt = 
    rbind(
      emissions_dt[,.(county_fips, stack_height, pollutant, diff_tons = tons*0.02, sim = "Welfare Maximizing")],
      emissions_dt[,.(county_fips, stack_height, pollutant, diff_tons = tons*0.04, sim = "Damage Minimizing")]
    )
  # Loading deepsolar data 
  deepsolar_dt = 
    fread(
      here("Data/deep-solar/deepsolar_tract.csv")
    )[population> 0 & !is.na(median_household_income)][,':='(
      fips = str_pad(fips, width = 11, side = "left", pad = "0"),
      solar_systems_per_cap = ifelse(
        population == 0, 0, 
        solar_system_count/population
      ),
      solar_systems_residential_per_cap = ifelse(
        population == 0, 0, 
        solar_system_count_residential/population
      ),
      pr_rad = avg_electricity_retail_rate*daily_solar_radiation
    )]
  # Getting county fips from tract fips 
  deepsolar_dt[,county_fips := str_sub(fips, 1,5)]


# Calculating benefits received by county -------------------------------------
  # Merging emissions with marginal damages 
  damage_rec_pollutant_dt = 
    merge(
      sim_emissions_dt,
      source_rec_dt,
      by = c('county_fips','stack_height','pollutant')
    )[,.(
      tot_ben_rec = sum(diff_tons*marginal_damage)),
      keyby = .(county_fips = receptor, pollutant, sim)
    ]
  # Aggregating across pollutants 
  damage_rec_dt = damage_rec_pollutant_dt[,.(
    tot_ben_rec = sum(tot_ben_rec)),
    keyby = .(county_fips, sim)
  ]
  # Merging with census tracts 
  result_dt =
    merge(
      damage_rec_dt,
      deepsolar_dt,
      by = 'county_fips',
      allow.cartesian = TRUE #for multiple sim types
    )
  # Calculating benefits per capita
  result_dt[,tot_ben_rec_pc := tot_ben_rec/population]
# Analysis --------------------------------------------------------------------
  # Lorenz curve 
  lorenz_dt = 
    rbind(
      # First calculating for benefits
      result_dt[
        order(sim, tot_ben_rec),.(
          cuml_frac = cumsum(tot_ben_rec)/sum(tot_ben_rec),
          cuml_pop = cumsum(population)/sum(population)),
        by = .(sim, type = rep('Total benefits',nrow(result_dt)))
      ],
      # Now calculating for income 
      result_dt[
        order(sim, median_household_income),.(
          cuml_frac = cumsum(median_household_income)/sum(median_household_income),
          cuml_pop = cumsum(population)/sum(population)),
        by = .(sim, type = rep('Median income',nrow(result_dt)))
      ]
    )
  # Plotting!
  ggplot(lorenz_dt, aes(x = cuml_pop, y = cuml_frac, color = type)) + 
    geom_abline(slope = 1, intercept = 0, linetype = 'dashed') + 
    geom_line() + 
    scale_color_brewer(name = '', palette = 'Dark2') +
    labs(
      x = 'Cumulative fraction of population',
      y = 'Cumulative fraction'
    ) +
    facet_wrap(~sim)
  # Regressions: Benefits per cap on share of race, income, urban 
  p_load(fixest)
  distr_mod = feols(
    data = result_dt,
    fml = tot_ben_rec_pc ~ 
      median_household_income + 
      I(race_asian/population) + 
      I(race_black_africa/population) + 
      I(race_indian_alaska/population) + 
      I(race_islander/population) + 
      I(race_other/population) + 
      I(race_two_more/population) 
      | sw0(state),
      cluster = ~state,
      split = ~sim
  )
  etable(distr_mod)
  # Plotting income vs benefits received
    ggplot(result_dt, aes(
      x = median_household_income, 
      y = tot_ben_rec_pc, 
      color = sim, 
      weight = population
    )) + 
    geom_smooth() + 
    scale_color_brewer(name = 'Simulation', palette = 'Dark2') +
    scale_x_continuous(
      name = 'Median Household Income',
      labels = scales::label_dollar(scale = 1e-3, suffix = "K")
    ) + 
    scale_y_continuous(
      name = 'Environmental Benefits per capita',
      labels = scales::label_dollar()
    )
  